library(tidyverse)
library(sf)
library(raster)
library(kableExtra)
library(tidycensus)
library(tigris)
library(FNN)
library(caret)
library(yardstick)
library(plotROC) 
library(ggrepel)
library(pROC)
library(grid)
library(gridExtra)
library(viridis)
library(igraph)
library(mapview)
library(FedData)
library(terra)
library(classInt)       # For classification
library(mapview)        # For interactive maps
library(ggplot2)
library(knitr)
library(ROCR)
library(mapedit)  
palette2 <- c("#41b6c4","#253494")
palette4 <- c("#a1dab4","#41b6c4","#2c7fb8","#253494")
palette5 <- c("#ffffcc","#a1dab4","#41b6c4","#2c7fb8","#253494")
palette10 <- c("#f7fcf0","#e0f3db","#ccebc5","#a8ddb5","#7bccc4",
               "#4eb3d3","#2b8cbe","#0868ac","#084081","#f7fcf0")
# Custom helper functions used in urban growth modeling
quintileBreaks <- function(df,variable) {
    as.character(quantile(df[[variable]],
                          c(.01,.2,.4,.6,.8),na.rm=T))
}

xyC <- function(aPolygonSF) {
  as.data.frame(
    cbind(x=st_coordinates(st_centroid(aPolygonSF))[,1],
          y=st_coordinates(st_centroid(aPolygonSF))[,2]))
} 

rast <- function(inRaster) {
  data.frame(
    xyFromCell(inRaster, 1:ncell(inRaster)), 
    value = getValues(inRaster)) }

nn_function <- function(measureFrom,measureTo,k) {
  measureFrom_Matrix <- as.matrix(measureFrom)
  measureTo_Matrix <- as.matrix(measureTo)
  nn <- get.knnx(measureTo, measureFrom, k)$nn.dist
  output <- as.data.frame(nn) %>%
    rownames_to_column(var = "thisPoint") %>%
    gather(points, point_distance, V1:ncol(.)) %>%
    arrange(as.numeric(thisPoint)) %>%
    group_by(thisPoint) %>%
    summarize(pointDistance = mean(point_distance)) %>%
    arrange(as.numeric(thisPoint)) %>% 
    dplyr::select(-thisPoint) %>%
    pull()
  return(output)  
}

aggregateRaster <- function(inputRasterList, theFishnet) {
  theseFishnets <- theFishnet %>% dplyr::select()
  for (i in inputRasterList) {
    varName <- names(i)
    thesePoints <-
      rasterToPoints(i) %>%
      as.data.frame() %>%
      st_as_sf(coords = c("x", "y"), crs = st_crs(theFishnet)) %>%
      filter(.[[1]] == 1)
    thisFishnet <-
      aggregate(thesePoints, theFishnet, length) %>%
      mutate(!!varName := ifelse(is.na(.[[1]]),0,1))
    theseFishnets <- cbind(theseFishnets,thisFishnet)
  }
  return(theseFishnets)
}
#import land cover data in 2011 and 2021
lc_2011 <- terra::rast("/Users/1m/Library/CloudStorage/OneDrive-PennO365/25spring/planning and env modelling/Austin/data/land cover/2011.tif")
lc_2021 <- terra::rast("/Users/1m/Library/CloudStorage/OneDrive-PennO365/25spring/planning and env modelling/Austin/data/land cover/2021.tif")

# import road data in 2011 and 2021
roads_2011 <- vect("/Users/1m/Library/CloudStorage/OneDrive-PennO365/25spring/planning and env modelling/Austin/r_import/tl_2011_48_pri_FeaturesToJSO.json")
roads_2021 <- vect("/Users/1m/Library/CloudStorage/OneDrive-PennO365/25spring/planning and env modelling/Austin/r_import/tl_2021_48_pri_FeaturesToJSO.json")

# MSA boundary  
msa_boundary <- vect("/Users/1m/Library/CloudStorage/OneDrive-PennO365/25spring/planning and env modelling/Austin/r_import/bastrop_county_FeaturesToJSO.json")  
msa_boundary <- project(msa_boundary, crs(lc_2011))
roads_2011 <- project(roads_2011, crs(lc_2011))
roads_2021 <- project(roads_2021, crs(lc_2011))

# Clip raster and vectors to Austin MSA boundary
lc_2011_austin <- mask(crop(lc_2011, msa_boundary), msa_boundary)
lc_2021_austin <- mask(crop(lc_2021, msa_boundary), msa_boundary)
roads_2011_austin <- crop(roads_2011, msa_boundary)
roads_2021_austin <- crop(roads_2021, msa_boundary)
austinMSA <- msa_boundary %>% sf::st_as_sf()
austinMSA <- st_transform(austinMSA, 2278)  # NAD83 / Texas Central (ftUS)
# Convert terra SpatRaster to raster format (if needed for compatibility)
lc_2011_r <- raster::raster(lc_2011_austin)
lc_2021_r <- raster::raster(lc_2021_austin)

# Resample to 30x coarser resolution using modal value (for categorical data)
lc_2011_rs <- aggregate(lc_2011_r, fact = 30, fun = "modal")
lc_2021_rs <- aggregate(lc_2021_r, fact = 30, fun = "modal")

# Reclassify to Developed (1) vs. Undeveloped (0)
# Define reclassification: Developed classes = 13 to 24
reclassMatrix <- matrix(c(
  0, 12, 0,
  12, 24, 1,
  24, Inf, 0),
  ncol = 3, byrow = TRUE
)

# Reclassify each land cover raster
developed_2011 <- reclassify(lc_2011_rs, reclassMatrix)
developed_2021 <- reclassify(lc_2021_rs, reclassMatrix)

# Detect Development Change
# Add rasters together: 0 = undeveloped both, 1 = changed, 2 = developed both
development_change <- developed_2011 + developed_2021

# Histogram to inspect values
hist(development_change, main = "Land Cover Change (2011–2021)",
     xlab = "0: No Dev | 1: Change | 2: Dev Both", col = "lightblue")

# Isolate Only Changed Cells (value = 1)
# Set values that are NOT 1 (i.e., no change or dev both) to NA
development_change[development_change != 1] <- NA

# Visualize Land Cover Change
mapView(development_change, layer.name = "Development Change (2011–2021)")
# Create fishnet grid over the Austin MSA at same resolution & CRS as development raster
# Convert msa_boundary to sf format
msa_boundary_sf <- sf::st_as_sf(msa_boundary)

# Then create the fishnet
austin_fishnet <- 
  st_make_grid(msa_boundary_sf %>%
                 st_transform(crs(development_change)),
               cellsize = res(development_change)[1],
               square = TRUE) %>%
  st_sf() %>%
  st_intersection(
    msa_boundary_sf %>%
      dplyr::select(geometry) %>%
      st_transform(crs(development_change))
  ) %>%
  mutate(uniqueID = rownames(.))
# Reclassify NLCD codes into new land cover categories

developed_2011 <- lc_2011_rs %in% c(21, 22, 23, 24)
forest_2011 <- lc_2011_rs %in% c(41, 42, 43)
farm_2011 <- lc_2011_rs %in% c(81, 82)
wetlands_2011 <- lc_2011_rs %in% c(90, 95)
otherUndeveloped_2011 <- lc_2011_rs %in% c(52, 71, 31)
water_2011 <- lc_2011_rs == 11

developed_2021 <- lc_2021_rs %in% c(21, 22, 23, 24)
forest_2021 <- lc_2021_rs %in% c(41, 42, 43)
farm_2021 <- lc_2021_rs %in% c(81, 82)
wetlands_2021 <- lc_2021_rs %in% c(90, 95)
otherUndeveloped_2021 <- lc_2021_rs %in% c(52, 71, 31)
water_2021 <- lc_2021_rs == 11

# Assign names for aggregation
names(developed_2011) <- "developed_2011"
names(forest_2011) <- "forest_2011"
names(farm_2011) <- "farm_2011"
names(wetlands_2011) <- "wetlands_2011"
names(otherUndeveloped_2011) <- "otherUndeveloped_2011"
names(water_2011) <- "water_2011"

names(developed_2021) <- "developed_2021"
names(forest_2021) <- "forest_2021"
names(farm_2021) <- "farm_2021"
names(wetlands_2021) <- "wetlands_2021"
names(otherUndeveloped_2021) <- "otherUndeveloped_2021"
names(water_2021) <- "water_2021"

# Create raster lists
rasterList_2011 <- c(developed_2011, forest_2011, farm_2011,
                     wetlands_2011, otherUndeveloped_2011, water_2011)

rasterList_2021 <- c(developed_2021, forest_2021, farm_2021,
                     wetlands_2021, otherUndeveloped_2021, water_2021)

# Aggregate 2011 land cover categories to fishnet
lcRasters_2011 <- aggregateRaster(rasterList_2011, austin_fishnet) %>%
  dplyr::select(developed_2011,
                forest_2011,
                farm_2011,
                wetlands_2011,
                otherUndeveloped_2011,
                water_2011) %>%
  mutate_if(is.numeric, as.factor)

# Aggregate 2021 land cover categories to fishnet (if needed for visual/validation)
lcRasters_2021 <- aggregateRaster(rasterList_2021, austin_fishnet) %>%
  dplyr::select(developed_2021,
                forest_2021,
                farm_2021,
                wetlands_2021,
                otherUndeveloped_2021,
                water_2021) %>%
  mutate_if(is.numeric, as.factor)

# Convert raster to points, then join to fishnet
changePoints <- rasterToPoints(development_change) %>%
  as.data.frame() %>%
  st_as_sf(coords = c("x", "y"), crs = st_crs(austin_fishnet))

# Summarize change points to each fishnet cell
fishnet <- aggregate(changePoints, austin_fishnet, FUN = sum) %>%
  mutate(development_change = ifelse(is.na(layer), 0, 1),
         development_change = as.factor(development_change)) %>%
  dplyr::select(-layer)
# First, create a new base fishnet with uniqueID (if you haven't)
fishnet <- austin_fishnet %>%
  mutate(uniqueID = as.character(rownames(.)))

# Then, make sure lcRasters_2011 has the same uniqueID column in the same order
lcRasters_2011 <- lcRasters_2011 %>%
  mutate(uniqueID = as.character(rownames(.)))

# Now you can safely join
fishnet <- left_join(fishnet, st_drop_geometry(lcRasters_2011), by = "uniqueID")
# Plot land cover types in 2011 to verify aggregation
lcRasters_2011 %>%
  st_centroid() %>%
  gather(key = "variable", value = "value", developed_2011:water_2011) %>% 
  mutate(X = xyC(.)$x, Y = xyC(.)$y) %>%
  ggplot() +
    geom_sf(data = austinMSA) +
    geom_point(aes(X, Y, colour = as.factor(value))) +
    facet_wrap(~variable) +
    scale_colour_manual(values = palette2,
                        labels = c("Other", "Land Cover"),
                        name = "") +
    labs(title = "Land Cover Types, 2011 (Austin Fishnet)",
         subtitle = "As fishnet centroids") +
    theme_void()
## Warning: st_centroid assumes attributes are constant over geometries

### This 6-panel map displays the spatial distribution of major land cover types within the Austin metropolitan statistical area as of 2011, mapped to a fishnet grid structure. Categories include developed land, farmland, forest, other undeveloped areas, wetlands, and water. These maps provide a baseline understanding of landscape composition and land availability prior to urban expansion, supporting analysis of conversion patterns and suitability for future development.

section 6

# Tract Boundaries
tracts_2011 <- st_read("/Users/1m/Library/CloudStorage/OneDrive-PennO365/25spring/planning and env modelling/Austin/r_import/2011_census_tract.json")
## Reading layer `2011_census_tract' from data source 
##   `/Users/1m/Library/CloudStorage/OneDrive-PennO365/25spring/planning and env modelling/Austin/r_import/2011_census_tract.json' 
##   using driver `ESRIJSON'
## Simple feature collection with 5265 features and 13 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -106.6456 ymin: 25.83716 xmax: -93.50804 ymax: 36.5007
## Geodetic CRS:  NAD83
tracts_2021 <- st_read("/Users/1m/Library/CloudStorage/OneDrive-PennO365/25spring/planning and env modelling/Austin/r_import/2021_census_tract.json")
## Reading layer `2021_census_tract' from data source 
##   `/Users/1m/Library/CloudStorage/OneDrive-PennO365/25spring/planning and env modelling/Austin/r_import/2021_census_tract.json' 
##   using driver `ESRIJSON'
## Simple feature collection with 6896 features and 13 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -106.6456 ymin: 25.83716 xmax: -93.50804 ymax: 36.5007
## Geodetic CRS:  NAD83
# Define FIPS codes for your counties
austin_fips <- c("48453", "48491", "48209", "48021", "48055")

# Filter tract shapefiles to only Austin MSA counties
tracts_2011 <- tracts_2011 %>% filter(substr(GEOID, 1, 5) %in% austin_fips)
tracts_2021 <- tracts_2021 %>% filter(substr(GEOID, 1, 5) %in% austin_fips)

library(dplyr)
library(readr)
library(stringr)

# Define the directory and file patterns
pop_dir <- "/Users/1m/Library/CloudStorage/OneDrive-PennO365/25spring/planning and env modelling/Austin/r_import/pop"

# Define the county names and years
counties <- c("travis", "williamson", "hays", "bastrop", "caldwell")
years <- c("2011", "2021")

read_clean_pop <- function(county, year) {
  file_path <- file.path(pop_dir, paste0(county, "_", year, ".csv"))
  
  read_csv(file_path, skip = 1, show_col_types = FALSE) %>%
    filter(str_starts(Geography, "1400000US")) %>%  # ✅ filter real tract rows
    transmute(
      NAME = `Geographic Area Name`,
      GEOID_raw = Geography,
      pop = as.numeric(`Estimate!!Total`),
      GEOID = str_extract(Geography, "\\d{11}$"),  # Only final 11 digits
      county = str_to_title(county),
      year = year
    )
}

# Read and bind data for each year
pop2011_all <- bind_rows(lapply(counties, read_clean_pop, year = "2011"))
## New names:
## New names:
## New names:
## New names:
## New names:
## • `` -> `...5`
pop2021_all <- bind_rows(lapply(counties, read_clean_pop, year = "2021"))
## New names:
## New names:
## New names:
## New names:
## New names:
## • `` -> `...5`
tracts_2011 <- left_join(tracts_2011, pop2011_all, by = "GEOID")
tracts_2021 <- left_join(tracts_2021, pop2021_all, by = "GEOID")
# Custom palette
palette5 <- c("#ffffcc", "#a1dab4", "#41b6c4", "#2c7fb8", "#253494")

# Plot for 2011
p2011 <- ggplot() +
  geom_sf(data = tracts_2011, aes(fill = factor(ntile(pop, 5))), color = NA) +
  scale_fill_manual(values = palette5,
                    labels = quantile(tracts_2011$pop, probs = seq(0, 1, 0.2), na.rm = TRUE),
                    name = "Quintile\nBreaks") +
  labs(title = "Population, Austin MSA: 2011") +
  theme_void()

# Plot for 2021
p2021 <- ggplot() +
  geom_sf(data = tracts_2021, aes(fill = factor(ntile(pop, 5))), color = NA) +
  scale_fill_manual(values = palette5,
                    labels = quantile(tracts_2021$pop, probs = seq(0, 1, 0.2), na.rm = TRUE),
                    name = "Quintile\nBreaks") +
  labs(title = "Population, Austin MSA: 2021") +
  theme_void()

# Display side by side
gridExtra::grid.arrange(p2011, p2021, ncol = 2)

### This figure compares population distribution across the Austin metropolitan area in 2011 and 2021, classified by quintiles. Darker shades represent census tracts with higher population counts. The maps visually capture demographic shifts over the decade, particularly increased density in central Austin and surrounding high-growth zones. These patterns are critical for understanding development pressure and guiding infrastructure planning.

# Make sure tracts are in the same CRS as the fishnet
tracts_2011 <- st_transform(tracts_2011, st_crs(austin_fishnet))
tracts_2021 <- st_transform(tracts_2021, st_crs(austin_fishnet))

# Interpolate tract population data to fishnet grid for 2011
fishnetPopulation2011 <- 
  st_interpolate_aw(tracts_2011["pop"], austin_fishnet, extensive = TRUE) %>%
  st_centroid() %>%
  st_join(austin_fishnet, ., join = st_intersects) %>%
  mutate(pop_2011 = replace_na(pop, 0)) %>%
  dplyr::select(pop_2011)
## Warning in st_interpolate_aw.sf(tracts_2011["pop"], austin_fishnet, extensive =
## TRUE): st_interpolate_aw assumes attributes are constant or uniform over areas
## of x
## Warning: st_centroid assumes attributes are constant over geometries
# Interpolate tract population data to fishnet grid for 2021
fishnetPopulation2021 <- 
  st_interpolate_aw(tracts_2021["pop"], austin_fishnet, extensive = TRUE) %>%
  st_centroid() %>%
  st_join(austin_fishnet, ., join = st_intersects) %>%
  mutate(pop_2021 = replace_na(pop, 0)) %>%
  dplyr::select(pop_2021)
## Warning in st_interpolate_aw.sf(tracts_2021["pop"], austin_fishnet, extensive =
## TRUE): st_interpolate_aw assumes attributes are constant or uniform over areas
## of x
## Warning in st_interpolate_aw.sf(tracts_2021["pop"], austin_fishnet, extensive =
## TRUE): st_centroid assumes attributes are constant over geometries
library(gridExtra)

# Plot: 2011 Tract Population
pop_plot_tract_2011 <- ggplot() +
  geom_sf(data = tracts_2011, aes(fill = factor(ntile(pop, 5))), color = NA) +
  scale_fill_manual(
    values = palette5,
    labels = substr(quintileBreaks(tracts_2011, "pop"), 1, 4),
    name = "Quintile\nBreaks"
  ) +
  labs(
    title = "Population, Austin MSA: 2011",
    subtitle = "Represented as census tracts"
  ) +
  theme_void()

# Plot: 2011 Fishnet Interpolated Population
pop_plot_fishnet_2011 <- ggplot() +
  geom_sf(data = fishnetPopulation2011, aes(fill = factor(ntile(pop_2011, 5))), color = NA) +
  scale_fill_manual(
    values = palette5,
    labels = substr(quintileBreaks(fishnetPopulation2011, "pop_2011"), 1, 4),
    name = "Quintile\nBreaks"
  ) +
  labs(
    title = "Population, Austin MSA: 2011",
    subtitle = "Represented as interpolated fishnet grid"
  ) +
  theme_void()

# Combine the two plots
grid.arrange(pop_plot_tract_2011, pop_plot_fishnet_2011, ncol = 2)

### fugure. Population Distribution, Austin MSA (2011): Census Tracts vs. Interpolated Grid

# Visualize 2021 Tract vs Fishnet Interpolated Population
grid.arrange(
  ggplot() +
    geom_sf(data = tracts_2021, aes(fill = factor(ntile(pop, 5))), colour = NA) +
    scale_fill_manual(values = palette5,
                      labels = substr(quintileBreaks(tracts_2021, "pop"), 1, 4),
                      name = "Quintile\nBreaks") +
    labs(title = "Population, Austin MSA: 2021",
         subtitle = "Represented as Tracts (raw)") +
    theme_void(),

  ggplot() +
    geom_sf(data = fishnetPopulation2021, aes(fill = factor(ntile(pop_2021, 5))), colour = NA) +
    scale_fill_manual(values = palette5,
                      labels = substr(quintileBreaks(fishnetPopulation2021, "pop_2021"), 1, 4),
                      name = "Quintile\nBreaks") +
    labs(title = "Population, Austin MSA: 2021",
         subtitle = "Interpolated to Fishnet Grid") +
    theme_void(),
  ncol = 2
)

### figure. Population Distribution, Austin MSA (2021): Census Tracts vs. Interpolated Grid

section 7

# 7.1 Define and subset highways for Austin MSA
austinHighways <- roads_2011 %>%
  st_as_sf() %>%
  st_transform(st_crs(austinMSA)) %>%
  st_intersection(st_geometry(austinMSA)) %>%
  st_transform(st_crs(austin_fishnet))
## Warning: attribute variables are assumed to be spatially constant throughout
## all geometries
# 7.1 FIX: Make sure development_change exists in fishnet
# Re-run this only if you lost the column previously
# Note: development_change was calculated by summing raster points and aggregating to the grid

# Re-aggregate dev change from raster to fishnet (just to be sure)
changePoints <- rasterToPoints(development_change) %>%
  as.data.frame() %>%
  st_as_sf(coords = c("x", "y"), crs = st_crs(austin_fishnet))

fishnet <- aggregate(changePoints, austin_fishnet, FUN = sum) %>%
  mutate(development_change = ifelse(is.na(layer), 0, 1),
         development_change = factor(development_change, levels = c(0,1), labels = c("No Change", "New Development"))) %>%
  dplyr::select(-layer)

# 7.2 Visualize development change and highways using geometry-aware `geom_sf()`
ggplot() +
  geom_sf(data = fishnet, aes(fill = development_change), color = NA) +
  geom_sf(data = austinHighways, color = "black", size = 0.4) +
  scale_fill_manual(values = palette2) +
  labs(title = "New Development and Highways (2011–2021)",
       subtitle = "Austin Fishnet Cells with Primary Roads") +
  theme_void()

### the spatial relationship between primary highways and new development across the Austin Metropolitan Statistical Area (MSA). Using a fishnet grid, areas marked in dark blue represent new development from 2011 to 2021, overlaid with primary road networks. The visualization highlights how development tends to cluster around major transportation corridors, supporting the inclusion of road proximity as a key predictor in the urban growth model.

# 7.2.1 Calculate Distance from Fishnet Centroids to Highways
highwayPoints_fishnet_2011 <- austin_fishnet %>%
  st_centroid() %>%
  mutate(distance_highways_2011 = as.numeric(
    st_distance(., st_union(austinHighways) %>%
                  st_transform(st_crs(austin_fishnet)))
  )) %>%
  as.data.frame() %>%
  dplyr::select(-geometry) %>%
  left_join(austin_fishnet, ., by = "uniqueID") %>%
  st_as_sf()
## Warning: st_centroid assumes attributes are constant over geometries
# 7.2.2 Assume 2021 network = 2011 (duplicate)
highwayPoints_fishnet_2021 <- highwayPoints_fishnet_2011 %>%
  rename(distance_highways_2021 = distance_highways_2011)

ggplot() +
  geom_sf(data = austinMSA %>% st_transform(st_crs(highwayPoints_fishnet_2011))) +
  geom_point(data = highwayPoints_fishnet_2011,
             aes(x = xyC(highwayPoints_fishnet_2011)[,1],
                 y = xyC(highwayPoints_fishnet_2011)[,2],
                 colour = factor(ntile(distance_highways_2011, 5))),
             size = 1.5) +
  scale_colour_manual(
    values = palette5,
    labels = substr(quintileBreaks(highwayPoints_fishnet_2011, "distance_highways_2011"), 1, 8),
    name = "Quintile\nBreaks"
  ) +
  geom_sf(data = austinHighways, colour = "red") +
  labs(title = "Distance to Highways (ft)",
       subtitle = "Fishnet Centroids; Highways visualized in red") +
  theme_void()
## Warning: st_centroid assumes attributes are constant over geometries
## Warning: st_centroid assumes attributes are constant over geometries
## Warning: st_centroid assumes attributes are constant over geometries
## Warning: st_centroid assumes attributes are constant over geometries

### Euclidean distance from each fishnet centroid to the nearest primary highway within the Austin Metropolitan Statistical Area. Red lines represent major highways, while the color gradient shows increasing distance from these roads

new proposed highway as predictor

# 1. Load your new highway file as sf
new_highway <- st_read("/Users/1m/Library/CloudStorage/OneDrive-PennO365/25spring/planning and env modelling/Austin/r_import/proposed_1highway.geojson")
## Reading layer `proposed_1highway' from data source 
##   `/Users/1m/Library/CloudStorage/OneDrive-PennO365/25spring/planning and env modelling/Austin/r_import/proposed_1highway.geojson' 
##   using driver `ESRIJSON'
## Simple feature collection with 1 feature and 2 fields
## Geometry type: LINESTRING
## Dimension:     XY
## Bounding box:  xmin: -10877710 ymin: 3550515 xmax: -10868470 ymax: 3617404
## Projected CRS: WGS 84 / Pseudo-Mercator
# 2. Ensure it matches CRS of fishnet
new_highway <- st_transform(new_highway, st_crs(austin_fishnet))

# 3. Calculate distance from centroids to the highway
fishnet_centroids <- st_centroid(austin_fishnet)
## Warning: st_centroid assumes attributes are constant over geometries
fishnet_centroids$distance_new_highway <- as.numeric(
  st_distance(fishnet_centroids, st_union(new_highway))
)

#section 8

# Objective:
# The code calculates the spatial lag to the nearest two developed cells in 2011 and 2021 from the centroids of a fishnet grid, and then visualizes the 2011 lag in a choropleth-style plot using quintile classification.

# Calculate spatial lag to 2 nearest developed cells in 2011
fishnet$lagDevelopment_2011 <-
  nn_function(
    fishnet %>%
      st_centroid() %>%
      st_coordinates() %>%
      as.data.frame(),
    lcRasters_2011 %>%
      filter(developed_2011 == 1) %>%
      st_centroid() %>%
      st_coordinates() %>%
      as.data.frame(),
    k = 2
  )
## Warning: st_centroid assumes attributes are constant over geometries
## Warning: st_centroid assumes attributes are constant over geometries
# Duplicate for 2021 using the same assumption (or adapt if new dev data used)
fishnet$lagDevelopment_2021 <-
  nn_function(
    fishnet %>%
      st_centroid() %>%
      st_coordinates() %>%
      as.data.frame(),
    lcRasters_2021 %>%
      filter(developed_2021 == 1) %>%
      st_centroid() %>%
      st_coordinates() %>%
      as.data.frame(),
    k = 2
  )
## Warning: st_centroid assumes attributes are constant over geometries
## Warning: st_centroid assumes attributes are constant over geometries
ggplot() +
  geom_sf(data = austinMSA %>% st_transform(st_crs(fishnet))) +
  geom_point(data = fishnet,
             aes(x = xyC(fishnet)[,1],
                 y = xyC(fishnet)[,2],
                 colour = factor(ntile(lagDevelopment_2011, 5))),
             size = 1.5) +
  scale_colour_manual(values = palette5,
                      labels = substr(quintileBreaks(fishnet, "lagDevelopment_2011"), 1, 7),
                      name = "Quintile\nBreaks") +
  labs(title = "Spatial Lag to 2011 Development (ft)",
       subtitle = "Measured from fishnet centroids to nearest developed cells") +
  theme_void()
## Warning: st_centroid assumes attributes are constant over geometries
## Warning: st_centroid assumes attributes are constant over geometries
## Warning: st_centroid assumes attributes are constant over geometries
## Warning: st_centroid assumes attributes are constant over geometries

### the spatial lag to prior urban development in the Austin MSA by showing the average distance (in feet) from each fishnet centroid to its two nearest developed cells in 2011. Quintile classification reveals spatial patterns in proximity, highlighting areas with strong development momentum (lighter colors) versus more isolated or undeveloped regions (darker shades). This metric is critical in understanding contagion effects and diffusion-based urban growth.

section 9

# Define the directory containing your county geojson files
county_dir <- "/Users/1m/Library/CloudStorage/OneDrive-PennO365/25spring/planning and env modelling/Austin/r_import/county"

# Define the county file name roots (all lowercase to match filenames)
county_names <- c("bastrop", "caldwell", "hays", "travis", "williamson")

# Function to read and tag each county
read_county <- function(county) {
  file_path <- file.path(county_dir, paste0(tools::toTitleCase(county), "_County.geojson"))
  st_read(file_path, quiet = TRUE) %>%
    mutate(NAME = tools::toTitleCase(county))  # Add county name if not present
}

# Read and bind all counties
studyAreaCounties <- bind_rows(lapply(county_names, read_county)) %>%
  st_transform(st_crs(austin_fishnet))  # Match CRS

countyFishnet <- austin_fishnet %>%
  st_join(studyAreaCounties %>% dplyr::select(NAME)) %>%
  as.data.frame() %>%
  dplyr::select(uniqueID, NAME) %>%
  left_join(austin_fishnet, .) %>%
  st_as_sf() %>%
  group_by(uniqueID) %>%
  slice(1) %>%
  ungroup() %>%
  arrange(as.numeric(uniqueID))
## Joining with `by = join_by(uniqueID)`

section 10

# Combine t1: 2011-based modeling dataset
dat_2011 <- 
  cbind(fishnet, highwayPoints_fishnet_2011, fishnetPopulation2011, lcRasters_2011, countyFishnet) %>%
  as.data.frame() %>%
  dplyr::select(uniqueID, development_change, lagDevelopment_2011, distance_highways_2011, pop_2011,  
                developed_2011, forest_2011, farm_2011, wetlands_2011, otherUndeveloped_2011, water_2011,
                NAME, geometry) %>%
  filter(water_2011 == 0) %>%
  rename_with(~ str_remove(.x, "_2011"))

# Combine t2: 2021-based forecasting dataset
dat_2021 <- 
  cbind(fishnet, highwayPoints_fishnet_2021, fishnetPopulation2021, lcRasters_2021, countyFishnet) %>%
  as.data.frame() %>%
  dplyr::select(uniqueID, development_change, lagDevelopment_2021, distance_highways_2021, pop_2021,  
                developed_2021, forest_2021, farm_2021, wetlands_2021, otherUndeveloped_2021, water_2021,
                NAME, geometry) %>%
  filter(water_2021 == 0)  %>%
  rename_with(~ str_remove(.x, "_2021"))
# ---- Integrate New Highway Distance into 2031 Dataset ----
# Assumes fishnet_newhighway has columns "uniqueID" and "distance_new_highway"

# 4. Create a table with just ID and distance
new_hw_distances <- fishnet_centroids %>%
  st_drop_geometry() %>%
  dplyr::select(uniqueID, distance_new_highway)

# 5. Join this distance to dat_2021 by uniqueID
dat_2021 <- dat_2021 %>%
  left_join(new_hw_distances, by = "uniqueID")

# 6. Confirm success
summary(dat_2021$distance_new_highway)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     2.39 18989.96 35358.88 35882.37 52790.49 80967.65

section 11

dat_2011 %>%
  dplyr::select(distance_highways, lagDevelopment, development_change, pop) %>%
  gather(Variable, Value, -development_change) %>%
  ggplot(aes(x = development_change, y = Value, fill = development_change)) + 
  geom_bar(position = "dodge", stat = "summary", fun = mean) +
  facet_wrap(~Variable, scales = "free") +
  scale_fill_manual(values = palette2,
                    labels = c("No Change", "New Development"),
                    name = "Mean Value") +
  labs(title = "New Development as a Function of Continuous Variables",
       x = "Development Change (0 = No, 1 = Yes)",
       y = "Mean Value") +
  theme_minimal()

### compares the average values of three continuous predictors—distance to highways, spatial lag to existing development, and population—between areas that experienced development (1) and those that did not (0) in the Austin MSA. New development areas tend to have lower distances to highways and prior development, and higher population values, supporting the role of accessibility and density in driving urban expansion.

# Convert land cover variables from factor to numeric (0/1)
dat_2011 <- dat_2011 %>%
  mutate(
    forest = as.numeric(as.character(forest)),
    farm = as.numeric(as.character(farm)),
    wetlands = as.numeric(as.character(wetlands)),
    otherUndeveloped = as.numeric(as.character(otherUndeveloped))
  )
dat_2011 %>%
  dplyr::select(development_change, forest, farm, wetlands, otherUndeveloped) %>%
  gather(key = "Land_Cover_Type", Value, -development_change) %>%
  group_by(development_change, Land_Cover_Type) %>%
  summarize(n = sum(Value), .groups = 'drop') %>%
  ungroup() %>%
  mutate(Conversion_Rate = paste0(round(100 * n / sum(n), 2), "%")) %>%
  filter(development_change == "New Development") %>%
  dplyr::select(Land_Cover_Type, Conversion_Rate) %>%
  kable() %>%
  kable_styling(full_width = FALSE)
Land_Cover_Type Conversion_Rate
farm 0.6%
forest 0.96%
otherUndeveloped 1.97%
wetlands 0.01%

This table presents the percentage of each land cover type in 2011 that was converted to new development by 2021 in the Austin MSA. The highest conversion occurred in areas labeled “Other Undeveloped” (1.98%), followed by forested land (0.96%) and farmland (0.61%), while wetlands were least affected (0.01%), indicating effective protection or physical development constraints.

section 12

library(caret)

set.seed(3456)
trainIndex <- createDataPartition(dat_2011$otherUndeveloped, p = 0.7, list = FALSE, times = 1)
datTrain <- dat_2011[trainIndex, ]
datTest <- dat_2011[-trainIndex, ]
Model1 <- glm(development_change ~ wetlands + forest + farm + otherUndeveloped,
              family = binomial(link = "logit"), data = datTrain)

Model2 <- glm(development_change ~ wetlands + forest + farm + otherUndeveloped + lagDevelopment,
              family = binomial(link = "logit"), data = datTrain)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Model3 <- glm(development_change ~ wetlands + forest + farm + otherUndeveloped + lagDevelopment + pop,
              family = binomial(link = "logit"), data = datTrain)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Model4 <- glm(development_change ~ wetlands + forest + farm + otherUndeveloped + lagDevelopment + pop + NAME,
              family = binomial(link = "logit"), data = datTrain)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Model5 <- glm(development_change ~ wetlands + forest + farm + otherUndeveloped + lagDevelopment + pop + distance_highways + NAME,
              family = binomial(link = "logit"), data = datTrain)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Model6 <- glm(development_change ~ wetlands + forest + farm + otherUndeveloped + pop + lagDevelopment * distance_highways + NAME,
              family = binomial(link = "logit"), data = datTrain)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# New logistic regression model including distance to proposed highway
Model2031 <- glm(development_change ~ wetlands + forest + farm + otherUndeveloped +
                   pop + lagDevelopment + distance_highways + distance_new_highway + NAME,
                 family = binomial(link = "logit"),
                 data = dat_2021)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(Model1)
## 
## Call:
## glm(formula = development_change ~ wetlands + forest + farm + 
##     otherUndeveloped, family = binomial(link = "logit"), data = datTrain)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -5.0999     0.3545 -14.385  < 2e-16 ***
## wetlands           0.8804     1.0679   0.824   0.4097    
## forest             1.8923     0.3729   5.075 3.87e-07 ***
## farm               0.8528     0.3816   2.235   0.0254 *  
## otherUndeveloped   2.4438     0.3629   6.735 1.64e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2795.0  on 9738  degrees of freedom
## Residual deviance: 2624.7  on 9734  degrees of freedom
## AIC: 2634.7
## 
## Number of Fisher Scoring iterations: 7
summary(Model2)
## 
## Call:
## glm(formula = development_change ~ wetlands + forest + farm + 
##     otherUndeveloped + lagDevelopment, family = binomial(link = "logit"), 
##     data = datTrain)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -4.459e+00  3.560e-01 -12.523   <2e-16 ***
## wetlands          2.639e+00  1.088e+00   2.425   0.0153 *  
## forest            3.621e+00  3.810e-01   9.506   <2e-16 ***
## farm              3.493e+00  3.954e-01   8.835   <2e-16 ***
## otherUndeveloped  4.505e+00  3.743e-01  12.038   <2e-16 ***
## lagDevelopment   -9.438e-04  6.088e-05 -15.504   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2795.0  on 9738  degrees of freedom
## Residual deviance: 2004.4  on 9733  degrees of freedom
## AIC: 2016.4
## 
## Number of Fisher Scoring iterations: 9
summary(Model3)
## 
## Call:
## glm(formula = development_change ~ wetlands + forest + farm + 
##     otherUndeveloped + lagDevelopment + pop, family = binomial(link = "logit"), 
##     data = datTrain)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -5.326e+00  4.727e-01 -11.266  < 2e-16 ***
## wetlands          3.211e+00  1.116e+00   2.878    0.004 ** 
## forest            4.249e+00  4.561e-01   9.316  < 2e-16 ***
## farm              4.169e+00  4.753e-01   8.771  < 2e-16 ***
## otherUndeveloped  5.142e+00  4.522e-01  11.369  < 2e-16 ***
## lagDevelopment   -8.898e-04  6.106e-05 -14.573  < 2e-16 ***
## pop               8.527e-04  2.105e-04   4.050 5.13e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2795.0  on 9738  degrees of freedom
## Residual deviance: 1991.6  on 9732  degrees of freedom
## AIC: 2005.6
## 
## Number of Fisher Scoring iterations: 9
summary(Model4)
## 
## Call:
## glm(formula = development_change ~ wetlands + forest + farm + 
##     otherUndeveloped + lagDevelopment + pop + NAME, family = binomial(link = "logit"), 
##     data = datTrain)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -5.569e+00  5.030e-01 -11.071  < 2e-16 ***
## wetlands          3.248e+00  1.112e+00   2.920 0.003499 ** 
## forest            4.202e+00  4.450e-01   9.442  < 2e-16 ***
## farm              4.258e+00  4.672e-01   9.114  < 2e-16 ***
## otherUndeveloped  5.029e+00  4.414e-01  11.393  < 2e-16 ***
## lagDevelopment   -8.855e-04  6.176e-05 -14.337  < 2e-16 ***
## pop               7.563e-04  2.224e-04   3.400 0.000673 ***
## NAMECaldwell     -7.407e-01  4.999e-01  -1.482 0.138436    
## NAMEHays          3.773e-02  2.673e-01   0.141 0.887752    
## NAMETravis        3.069e-01  2.417e-01   1.270 0.204193    
## NAMEWilliamson    7.161e-01  2.439e-01   2.936 0.003326 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2794.5  on 9731  degrees of freedom
## Residual deviance: 1967.1  on 9721  degrees of freedom
##   (7 observations deleted due to missingness)
## AIC: 1989.1
## 
## Number of Fisher Scoring iterations: 9
summary(Model5)
## 
## Call:
## glm(formula = development_change ~ wetlands + forest + farm + 
##     otherUndeveloped + lagDevelopment + pop + distance_highways + 
##     NAME, family = binomial(link = "logit"), data = datTrain)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -5.343e+00  5.042e-01 -10.596  < 2e-16 ***
## wetlands           3.086e+00  1.113e+00   2.773  0.00556 ** 
## forest             4.306e+00  4.440e-01   9.698  < 2e-16 ***
## farm               4.128e+00  4.661e-01   8.857  < 2e-16 ***
## otherUndeveloped   4.978e+00  4.407e-01  11.297  < 2e-16 ***
## lagDevelopment    -8.055e-04  6.327e-05 -12.731  < 2e-16 ***
## pop                6.725e-04  2.306e-04   2.916  0.00355 ** 
## distance_highways -1.723e-04  3.507e-05  -4.914 8.94e-07 ***
## NAMECaldwell      -8.363e-01  5.008e-01  -1.670  0.09491 .  
## NAMEHays           2.365e-01  2.696e-01   0.877  0.38047    
## NAMETravis         4.099e-01  2.426e-01   1.689  0.09114 .  
## NAMEWilliamson     7.150e-01  2.440e-01   2.931  0.00338 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2794.5  on 9731  degrees of freedom
## Residual deviance: 1936.9  on 9720  degrees of freedom
##   (7 observations deleted due to missingness)
## AIC: 1960.9
## 
## Number of Fisher Scoring iterations: 9
summary(Model6)
## 
## Call:
## glm(formula = development_change ~ wetlands + forest + farm + 
##     otherUndeveloped + pop + lagDevelopment * distance_highways + 
##     NAME, family = binomial(link = "logit"), data = datTrain)
## 
## Coefficients:
##                                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      -5.754e+00  5.162e-01 -11.148  < 2e-16 ***
## wetlands                          3.086e+00  1.112e+00   2.775 0.005524 ** 
## forest                            4.231e+00  4.428e-01   9.554  < 2e-16 ***
## farm                              4.086e+00  4.654e-01   8.780  < 2e-16 ***
## otherUndeveloped                  4.930e+00  4.398e-01  11.210  < 2e-16 ***
## pop                               7.309e-04  2.248e-04   3.251 0.001150 ** 
## lagDevelopment                   -5.368e-04  8.851e-05  -6.064 1.32e-09 ***
## distance_highways                 5.880e-05  6.831e-05   0.861 0.389353    
## NAMECaldwell                     -8.975e-01  5.002e-01  -1.794 0.072794 .  
## NAMEHays                          2.524e-01  2.696e-01   0.936 0.349315    
## NAMETravis                        3.927e-01  2.421e-01   1.622 0.104743    
## NAMEWilliamson                    6.825e-01  2.433e-01   2.806 0.005021 ** 
## lagDevelopment:distance_highways -1.287e-07  3.640e-08  -3.537 0.000405 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2794.5  on 9731  degrees of freedom
## Residual deviance: 1922.7  on 9719  degrees of freedom
##   (7 observations deleted due to missingness)
## AIC: 1948.7
## 
## Number of Fisher Scoring iterations: 11
summary(Model2031)
## 
## Call:
## glm(formula = development_change ~ wetlands + forest + farm + 
##     otherUndeveloped + pop + lagDevelopment + distance_highways + 
##     distance_new_highway + NAME, family = binomial(link = "logit"), 
##     data = dat_2021)
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           7.408e-01  3.048e-01   2.431  0.01507 *  
## wetlands1            -2.280e+00  1.024e+00  -2.225  0.02606 *  
## forest1              -4.012e+00  4.034e-01  -9.946  < 2e-16 ***
## farm1                -1.865e+01  3.699e+02  -0.050  0.95979    
## otherUndeveloped1    -5.154e+00  5.999e-01  -8.591  < 2e-16 ***
## pop                  -1.900e-03  1.592e-04 -11.932  < 2e-16 ***
## lagDevelopment       -6.186e-04  9.904e-05  -6.246 4.21e-10 ***
## distance_highways    -4.672e-05  2.560e-05  -1.825  0.06803 .  
## distance_new_highway -2.575e-05  6.193e-06  -4.158 3.21e-05 ***
## NAMECaldwell         -1.904e-01  5.058e-01  -0.377  0.70653    
## NAMEHays              6.711e-01  2.585e-01   2.596  0.00943 ** 
## NAMETravis           -2.895e-01  2.453e-01  -1.180  0.23790    
## NAMEWilliamson        1.092e-01  2.770e-01   0.394  0.69328    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3896.8  on 13886  degrees of freedom
## Residual deviance: 1989.1  on 13874  degrees of freedom
##   (7 observations deleted due to missingness)
## AIC: 2015.1
## 
## Number of Fisher Scoring iterations: 20
library(ggplot2)
library(dplyr)

data.frame(
  Model = c("Model1", "Model2", "Model3", "Model4", "Model5", "Model6","Model2031"),
  AIC = c(Model1$aic, Model2$aic, Model3$aic, Model4$aic, Model5$aic, Model6$aic, Model2031$aic)
) %>%
  ggplot() +
  geom_bar(aes(x = Model, y = AIC), stat = "identity", fill = "skyblue") +
  theme_minimal() +
  labs(title = "AIC Comparison of Urban Growth Models",
       y = "Akaike Information Criterion (AIC)")

Validating our Model Using the Test Set, Example on MODEL 6

testSetProbs <- data.frame(
  class = datTest$development_change,
  probs = predict(Model6, datTest, type = "response")
)

ggplot(testSetProbs, aes(probs)) +
  geom_density(aes(fill=class), alpha=0.5) +
  scale_fill_manual(values = palette2,
                    labels=c("No Change","New Development")) +
  labs(title = "Histogram of Test Set Predicted Probabilities",
       x = "Predicted Probabilities", y = "Density") +
  theme_minimal()

# Confusion matrix, Example on MODEL 6

# Fix the NA issue by reassigning directly from datTest
# Step 1: Recode reference class directly from datTest
testSetProbs$class <- as.numeric(datTest$development_change == "New Development")

# Step 2: Create predClass based on your threshold
testSetProbs$predClass <- ifelse(testSetProbs$probs > 0.05, 1, 0)

# Step 3: Convert both to factors with same levels
testSetProbs <- testSetProbs %>%
  mutate(
    class = factor(class, levels = c(0,1)),
    predClass = factor(predClass, levels = c(0,1))
  )

# Step 4: Confusion matrix
caret::confusionMatrix(
  reference = testSetProbs$class,
  data = testSetProbs$predClass,
  positive = "1"
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3395   22
##          1  656  100
##                                          
##                Accuracy : 0.8375         
##                  95% CI : (0.826, 0.8486)
##     No Information Rate : 0.9708         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.1869         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.81967        
##             Specificity : 0.83806        
##          Pos Pred Value : 0.13228        
##          Neg Pred Value : 0.99356        
##              Prevalence : 0.02924        
##          Detection Rate : 0.02396        
##    Detection Prevalence : 0.18116        
##       Balanced Accuracy : 0.82887        
##                                          
##        'Positive' Class : 1              
## 

ROC Curve, Example on MODEL 6

# Recode reference values to 1 and 0
testSetProbs <- testSetProbs %>%
  mutate(class = as.numeric(datTest$development_change == "New Development"))

# Plot the ROC curve
library(ggplot2)
library(plotROC)

ggplot(testSetProbs, aes(d = class, m = probs)) + 
  geom_roc(n.cuts = 50, labels = FALSE) + 
  style_roc(theme = theme_grey) +
  geom_abline(slope = 1, intercept = 0, size = 1.5, color = 'grey') +
  theme_minimal() +
  labs(title = "ROC Curve - Austin Urban Development Model")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Analyzing Error, Example on MODEL 6

# Load required libraries
library(dplyr)
library(sf)
library(kableExtra)
library(tidyr)
library(ggplot2)

# Create dat_2011_preds with predictions and classification labels
dat_2011_preds <- dat_2011 %>%
  mutate(
    development_change = case_when(
      development_change == "New Development" ~ 1,
      development_change == "No Change" ~ 0
    ),
    probs = predict(Model6, ., type = "response"),
    Threshold_5_Pct = as.numeric(probs >= 0.05),
    Threshold_17_Pct = as.numeric(probs >= 0.17)
  ) %>%
  mutate(
    confResult_05 = case_when(
      Threshold_5_Pct == 0 & development_change == 0 ~ "True_Negative",
      Threshold_5_Pct == 1 & development_change == 1 ~ "True_Positive",
      Threshold_5_Pct == 0 & development_change == 1 ~ "False_Negative",
      Threshold_5_Pct == 1 & development_change == 0 ~ "False_Positive"
    ),
    confResult_17 = case_when(
      Threshold_17_Pct == 0 & development_change == 0 ~ "True_Negative",
      Threshold_17_Pct == 1 & development_change == 1 ~ "True_Positive",
      Threshold_17_Pct == 0 & development_change == 1 ~ "False_Negative",
      Threshold_17_Pct == 1 & development_change == 0 ~ "False_Positive"
    )
  ) %>%
  st_as_sf()

# Inspect basic structure and values
table(dat_2011_preds$Threshold_5_Pct)
## 
##     0     1 
## 11319  2586
str(dat_2011_preds$Threshold_5_Pct)
##  num [1:13912] 0 0 0 0 0 0 0 0 0 0 ...
table(dat_2011$development_change)
## 
##       No Change New Development 
##           13473             439
str(dat_2011$development_change)
##  Factor w/ 2 levels "No Change","New Development": 1 1 1 1 1 1 1 1 1 1 ...
# Confusion summary table by county and threshold
dat_2011_preds %>%
  as.data.frame() %>%
  dplyr::select(NAME, confResult_05, confResult_17) %>%
  pivot_longer(cols = starts_with("confResult"),
               names_to = "Model_Type",
               values_to = "Confusion_Result") %>%
  group_by(NAME, Model_Type, Confusion_Result) %>%
  tally() %>%
  pivot_wider(names_from = Confusion_Result,
              values_from = n,
              values_fill = list(n = 0)) %>%
  mutate(
    TN_Rate_Specificity = 100 * (True_Negative / (True_Negative + False_Positive)),
    TP_Rate_Sensitivity = 100 * (True_Positive / (True_Positive + False_Negative))
  ) %>%
  dplyr::select(NAME, Model_Type, TN_Rate_Specificity, TP_Rate_Sensitivity) %>%
  kable() %>%
  kable_styling(full_width = FALSE)
NAME Model_Type TN_Rate_Specificity TP_Rate_Sensitivity
Bastrop confResult_05 92.11506 47.36842
Bastrop confResult_17 99.86464 10.52632
Caldwell confResult_05 99.61089 0.00000
Caldwell confResult_17 100.00000 0.00000
Hays confResult_05 81.89655 92.75362
Hays confResult_17 95.78040 52.17391
Travis confResult_05 68.83245 85.38012
Travis confResult_17 92.18338 45.02924
Williamson confResult_05 81.61680 87.09677
Williamson confResult_17 93.35443 60.00000
NA confResult_05 NaN NaN
NA confResult_17 NaN NaN
ggplot() +
  geom_sf(data = dat_2011_preds %>%
             st_centroid() %>%
             dplyr::select(confResult_05, confResult_17, geometry) %>%
             tidyr::pivot_longer(cols = starts_with("confResult"),
                                 names_to = "Threshold",
                                 values_to = "Classification") %>%
             filter(!is.na(Classification)),
           aes(colour = Classification)) +
  facet_wrap(~Threshold) +
  scale_colour_manual(
    values = c(
      "False_Negative" = "red",
      "False_Positive" = "yellow",
      "True_Negative" = "blue",
      "True_Positive" = "gray"
    ),
    name = "Prediction Result"
  ) +
  labs(title = "Spatial Distribution of Prediction Accuracy by Threshold") +
  theme_void()
## Warning: st_centroid assumes attributes are constant over geometries

# Prepare test set predictions for Model2031
dat_2021_preds <- dat_2021 %>%
  mutate(
    development_change = case_when(
      development_change == "New Development" ~ 1,
      development_change == "No Change" ~ 0
    ),
    probs = predict(Model2031, ., type = "response"),
    Threshold_5_Pct = as.numeric(probs >= 0.05),
    Threshold_17_Pct = as.numeric(probs >= 0.17)
  ) %>%
  mutate(
    confResult_05 = case_when(
      Threshold_5_Pct == 0 & development_change == 0 ~ "True_Negative",
      Threshold_5_Pct == 1 & development_change == 1 ~ "True_Positive",
      Threshold_5_Pct == 0 & development_change == 1 ~ "False_Negative",
      Threshold_5_Pct == 1 & development_change == 0 ~ "False_Positive"
    ),
    confResult_17 = case_when(
      Threshold_17_Pct == 0 & development_change == 0 ~ "True_Negative",
      Threshold_17_Pct == 1 & development_change == 1 ~ "True_Positive",
      Threshold_17_Pct == 0 & development_change == 1 ~ "False_Negative",
      Threshold_17_Pct == 1 & development_change == 0 ~ "False_Positive"
    )
  ) %>%
  st_as_sf()

# Recode for confusion matrix
testSetProbs_2031 <- dat_2021_preds %>%
  st_drop_geometry() %>%
  mutate(
    class = factor(development_change, levels = c(0,1)),
    predClass = factor(Threshold_5_Pct, levels = c(0,1))
  )

caret::confusionMatrix(
  reference = testSetProbs_2031$class,
  data = testSetProbs_2031$predClass,
  positive = "1"
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 12152    18
##          1  1296   421
##                                           
##                Accuracy : 0.9054          
##                  95% CI : (0.9004, 0.9102)
##     No Information Rate : 0.9684          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3582          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.95900         
##             Specificity : 0.90363         
##          Pos Pred Value : 0.24520         
##          Neg Pred Value : 0.99852         
##              Prevalence : 0.03161         
##          Detection Rate : 0.03032         
##    Detection Prevalence : 0.12364         
##       Balanced Accuracy : 0.93131         
##                                           
##        'Positive' Class : 1               
## 
# Make sure to load libraries
library(ggplot2)
library(dplyr)

# Generate predictions and attach to test set
testSetProbs_2031 <- data.frame(
  class = dat_2021$development_change,
  probs = predict(Model2031, dat_2021, type = "response")
)

# Plot density of predicted probabilities by true class
ggplot(testSetProbs_2031, aes(x = probs)) +
  geom_density(aes(fill = class), alpha = 0.5) +
  scale_fill_manual(values = palette2,
                    labels = c("No Change", "New Development")) +
  labs(
    title = "Histogram of test set predicted probabilities",
    x = "Predicted Probabilities",
    y = "Density"
  ) +
  theme_minimal()
## Warning: Removed 7 rows containing non-finite outside the scale range
## (`stat_density()`).

library(plotROC)

# Clean dataset before plotting ROC
testSetProbs_2031_clean <- testSetProbs_2031 %>%
  filter(!is.na(class), !is.na(probs))

testSetProbs_2031_clean <- testSetProbs_2031_clean %>%
  mutate(
    d = ifelse(class == "New Development", 1,
               ifelse(class == "No Change", 0, NA_real_))
  ) %>%
  filter(!is.na(d))  # just in case anything slipped through

library(plotROC)

ggplot(testSetProbs_2031_clean, aes(d = d, m = probs)) +
  geom_roc(n.cuts = 50, labels = FALSE) +
  style_roc(theme = theme_minimal()) +
  geom_abline(slope = 1, intercept = 0, size = 1.2, color = "gray") +
  labs(title = "ROC Curve - Model2031: Urban Development Forecast (Austin 2031)")

dat_2021_preds %>%
  as.data.frame() %>%
  dplyr::select(NAME, confResult_05, confResult_17) %>%
  pivot_longer(cols = starts_with("confResult"),
               names_to = "Model_Type",
               values_to = "Confusion_Result") %>%
  group_by(NAME, Model_Type, Confusion_Result) %>%
  tally() %>%
  pivot_wider(names_from = Confusion_Result,
              values_from = n,
              values_fill = list(n = 0)) %>%
  mutate(
    TN_Rate_Specificity = 100 * (True_Negative / (True_Negative + False_Positive)),
    TP_Rate_Sensitivity = 100 * (True_Positive / (True_Positive + False_Negative))
  ) %>%
  dplyr::select(NAME, Model_Type, TN_Rate_Specificity, TP_Rate_Sensitivity) %>%
  kable() %>%
  kable_styling(full_width = FALSE)
NAME Model_Type TN_Rate_Specificity TP_Rate_Sensitivity
Bastrop confResult_05 97.29272 100.00000
Bastrop confResult_17 97.36041 97.36842
Caldwell confResult_05 98.27682 83.33333
Caldwell confResult_17 99.11062 33.33333
Hays confResult_05 92.69510 97.10145
Hays confResult_17 94.41924 94.20290
Travis confResult_05 77.92509 92.98246
Travis confResult_17 88.66424 70.76023
Williamson confResult_05 89.69191 98.06452
Williamson confResult_17 93.66542 87.09677
NA confResult_05 NaN NaN
NA confResult_17 NaN NaN
ggplot() +
  geom_sf(data = dat_2021_preds %>%
             st_centroid() %>%
             dplyr::select(confResult_05, confResult_17, geometry) %>%
             tidyr::pivot_longer(cols = starts_with("confResult"),
                                 names_to = "Threshold",
                                 values_to = "Classification") %>%
             filter(!is.na(Classification)),
           aes(colour = Classification)) +
  facet_wrap(~Threshold) +
  scale_colour_manual(
    values = c(
      "False_Negative" = "red",
      "False_Positive" = "yellow",
      "True_Negative" = "blue",
      "True_Positive" = "gray"
    ),
    name = "Prediction Result"
  ) +
  labs(title = "Model2031: Spatial Distribution of Prediction Accuracy") +
  theme_void()
## Warning: st_centroid assumes attributes are constant over geometries

section 13

dat_2031_preds <- dat_2021 %>%
  mutate(
    probs = predict(Model2031, ., type = "response"),
    Prediction = as.factor(ifelse(probs >= 0.17, 1, 0))
  ) %>%
  st_as_sf()

ggplot(data = dat_2031_preds) +
  geom_point(aes(x = xyC(dat_2031_preds)[,1],
                 y = xyC(dat_2031_preds)[,2],
                 colour = Prediction)) +
  geom_sf(data = studyAreaCounties, fill = "transparent") +
  labs(title = "Development Predictions - 2031") +
  theme_void()
## Warning: st_centroid assumes attributes are constant over geometries
## Warning: st_centroid assumes attributes are constant over geometries
## Warning: st_centroid assumes attributes are constant over geometries
## Warning: st_centroid assumes attributes are constant over geometries

section 14 impact assessment

dat_2031_preds %>%
  as.data.frame() %>%
  filter(Prediction == 1) %>%
  tally() %>%
  rename(total_cells = n) %>%
  mutate(
    total_area_m = total_cells * res(lc_2021_rs)[1],
    total_km2 = total_area_m / 1e6,
    total_mi2 = total_km2 * 0.386102
  ) %>%
  kable() %>%
  kable_styling()
total_cells total_area_m total_km2 total_mi2
1139 1025100 1.0251 0.3957932
dat_2031_preds %>%
  as.data.frame() %>%
  filter(Prediction == 1) %>%
  group_by(NAME) %>%
  tally() %>%
  rename(total_cells = n) %>%
  mutate(
    total_area_m = total_cells * res(lc_2021_rs)[1],
    total_km2 = total_area_m / 1e6,
    total_mi2 = total_km2 * 0.386102
  ) %>%
  kable() %>%
  kable_styling()
NAME total_cells total_area_m total_km2 total_mi2
Bastrop 115 103500 0.1035 0.0399616
Caldwell 18 16200 0.0162 0.0062549
Hays 188 169200 0.1692 0.0653285
Travis 463 416700 0.4167 0.1608887
Williamson 355 319500 0.3195 0.1233596
dat_2031_preds %>%
  as.data.frame() %>%
  filter(Prediction == 1) %>%
  dplyr::select(farm, otherUndeveloped, forest, wetlands, NAME) %>%
  pivot_longer(cols = -NAME, names_to = "Land_Cover_Type", values_to = "Value") %>%
  mutate(Value = as.numeric(Value)) %>%
  group_by(NAME, Land_Cover_Type) %>%
  summarize(total_cells = sum(Value, na.rm = TRUE), .groups = "drop") %>%
  mutate(
    total_area_m = total_cells * res(lc_2021_rs)[1],
    total_km2 = total_area_m / 1e6,
    total_mi2 = total_km2 * 0.386102
  ) %>%
  kable() %>%
  kable_styling()
NAME Land_Cover_Type total_cells total_area_m total_km2 total_mi2
Bastrop farm 115 103500 0.1035 0.0399616
Bastrop forest 115 103500 0.1035 0.0399616
Bastrop otherUndeveloped 115 103500 0.1035 0.0399616
Bastrop wetlands 115 103500 0.1035 0.0399616
Caldwell farm 18 16200 0.0162 0.0062549
Caldwell forest 18 16200 0.0162 0.0062549
Caldwell otherUndeveloped 18 16200 0.0162 0.0062549
Caldwell wetlands 18 16200 0.0162 0.0062549
Hays farm 188 169200 0.1692 0.0653285
Hays forest 188 169200 0.1692 0.0653285
Hays otherUndeveloped 188 169200 0.1692 0.0653285
Hays wetlands 188 169200 0.1692 0.0653285
Travis farm 463 416700 0.4167 0.1608887
Travis forest 463 416700 0.4167 0.1608887
Travis otherUndeveloped 463 416700 0.4167 0.1608887
Travis wetlands 463 416700 0.4167 0.1608887
Williamson farm 355 319500 0.3195 0.1233596
Williamson forest 355 319500 0.3195 0.1233596
Williamson otherUndeveloped 355 319500 0.3195 0.1233596
Williamson wetlands 355 319500 0.3195 0.1233596